Preamble
# Clear workspace
rm(list=ls()); graphics.off()
### Load packages
library(tidyverse) # Collection of all the good stuff like dplyr, ggplot2 ect.
library(magrittr) # For extra-piping operators (eg. %<>%)
library(skimr) # For nice data summaries
Load the data
listings <- read_csv('http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2020-06-26/data/listings.csv.gz')
listings %>% head()
calendar <- read_csv('http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2020-06-26/data/calendar.csv.gz')
calendar %>% head()
reviews <- read_csv('http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2020-06-26/data/reviews.csv.gz')
reviews %>% head()
Data munging/tidying
Inspect again…
data <- listings
data %>% glimpse()
Varriable selection
data %<>%
rename(y = price_all) %>%
select(y, review_scores_rating, neighbourhood_cleansed, accommodates, room_type, bathrooms,is_business_travel_ready,
number_of_reviews, cancellation_policy, host_is_superhost, host_identity_verified, bedrooms)
Filtring observations
data %<>%
drop_na(y) %>%
filter(percent_rank(y) <0.95)
Also check the categorical variables for rare types
data %>% count(room_type, sort = TRUE)
data %<>%
filter(!(room_type %in% c('Shared room', 'Hotel room')))
Misssing data
listings %<>%
mutate(across(is_character, ~ifelse(.x == "", NA, .x)))
library(VIM)
data %>%
aggr(numbers = TRUE, prop = c(TRUE, FALSE))

Looks fine, litte missing
EDA
data %>%
skim()
── Data Summary ────────────────────────
Values
Name Piped data
Number of rows 14064
Number of columns 12
_______________________
Column type frequency:
character 3
logical 3
numeric 6
________________________
Group variables None
── Variable type: character ─────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate min max empty n_unique whitespace
1 neighbourhood_cleansed 0 1 5 25 0 11 0
2 room_type 0 1 12 15 0 2 0
3 cancellation_policy 0 1 8 27 0 4 0
── Variable type: logical ───────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean count
1 is_business_travel_ready 0 1 0 FAL: 14064
2 host_is_superhost 2 1.00 0.143 FAL: 12051, TRU: 2011
3 host_identity_verified 2 1.00 0.448 FAL: 7762, TRU: 6300
── Variable type: numeric ───────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
1 y 0 1 974. 365. 100 698 948 1246 1849 ▂▇▇▆▂
2 review_scores_rating 40 0.997 95.5 5.24 20 93 97 100 100 ▁▁▁▁▇
3 accommodates 0 1 3.17 1.41 1 2 3 4 12 ▇▅▁▁▁
4 bathrooms 3 1.00 1.05 0.223 0 1 1 1 5.5 ▇▁▁▁▁
5 number_of_reviews 0 1 19.1 30.0 2 5 10 22 637 ▇▁▁▁▁
6 bedrooms 5 1.00 1.46 0.780 0 1 1 2 8 ▇▃▁▁▁
library(GGally)
data %>% ggpairs()

Preprocessing
library(tidymodels)
Train and test split
data_split <- initial_split(data, prop = 0.75, strata = y)
data_train <- data_split %>% training()
data_test <- data_split %>% testing()
data_recipe <- data_train %>%
recipe(y ~.) %>%
step_center(all_numeric(), -all_outcomes()) %>%
step_scale(all_numeric(), -all_outcomes()) %>%
step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE)
Models & Workflows
Define Models
model_lm <- linear_reg(mode = 'regression') %>%
set_engine('lm')
model_xg <- boost_tree(mode = 'regression',
trees = 100,
mtry = tune(),
min_n = tune(),
tree_depth = tune(),
learn_rate = tune()
) %>%
set_engine("xgboost")
Workflows
workflow_general <- workflow() %>%
add_recipe(data_recipe)
workflow_lm <- workflow_general %>%
add_model(model_lm)
workflow_xg <- workflow_general %>%
add_model(model_xg)
Hyperparameter Tuning
Resampling
data_resample <- data_train %>%
vfold_cv(strata = y,
v = 3,
repeats = 2)
Hyperparameter Tuning
tune_xg <-
tune_grid(
workflow_xg,
resamples = data_resample,
grid = 10
)
tune_xg %>% autoplot()

best_param_xg <- tune_xg %>% select_best(metric = 'rmse')
best_param_xg
tune_xg %>% show_best(metric = 'rmse', n = 1)
Fit models
Fit models with tuned hyperparameters
workflow_final_xg <- workflow_xg %>%
finalize_workflow(parameters = best_param_xg)
fit_lm <- workflow_lm %>%
fit(data_train)
fit_xg <- workflow_final_xg %>%
fit(data_train)
Predict
pred_collected <- tibble(
truth = data_test %>% pull(y),
base = mean(truth),
lm = fit_lm %>% predict(new_data = data_test) %>% pull(.pred),
xg = fit_xg %>% predict(new_data = data_test) %>% pull(.pred),
) %>%
pivot_longer(cols = -truth,
names_to = 'model',
values_to = '.pred')
Evaluate
Metrics
pred_collected %>%
group_by(model) %>%
rmse(truth = truth, estimate = .pred) %>%
select(model, .estimate) %>%
arrange(.estimate)
pred_collected %>%
group_by(model) %>%
rsq(truth = truth, estimate = .pred) %>%
select(model, .estimate) %>%
arrange(.estimate)
Visuals
pred_collected %>%
ggplot(aes(x = truth, y = .pred, color = model)) +
geom_abline(lty = 2, color = "gray80", size = 1.5) +
geom_point(alpha = 0.5) +
labs(
x = "Truth",
y = "Predicted price",
color = "Type of model"
)

Variable Importance
Linear model
fit_lm %>%
# pluck(".workflow", 1) %>%
pull_workflow_fit() %>%
vip::vip(num_features = 10)

XGBoost
fit_xg %>%
# pluck(".workflow", 1) %>%
pull_workflow_fit() %>%
vip::vip(num_features = 10)

LS0tCnRpdGxlOiAiV29ya3Nob3A6IEV4cGxvcmluZyB0aGUgSW5zaWRlQWlyQm5CIGRhdGFzZXQgLSBQcmVkaWN0aW9uIgphdXRob3I6ICJEYW5pZWwgUy4gSGFpbiAoZHNoQGJ1c2luZXNzLmFhdS5kaykiCmRhdGU6ICJVcGRhdGVkIGByIGZvcm1hdChTeXMudGltZSgpLCAnJUIgJWQsICVZJylgIgpvdXRwdXQ6CiAgaHRtbF9ub3RlYm9vazoKICAgIGNvZGVfZm9sZGluZzogc2hvdwogICAgZGZfcHJpbnQ6IHBhZ2VkCiAgICB0b2M6IHRydWUKICAgIHRvY19kZXB0aDogMgogICAgdG9jX2Zsb2F0OgogICAgICBjb2xsYXBzZWQ6IGZhbHNlCiAgICB0aGVtZTogZmxhdGx5Ci0tLQoKYGBge3Igc2V0dXAsIGluY2x1ZGU9RkFMU0V9CiMgS25pdHIgb3B0aW9ucwojIyMgR2VuZXJpYyBwcmVhbWJsZQpTeXMuc2V0ZW52KExBTkcgPSAiZW4iKSAjIEZvciBlbmdsaXNoIGxhbmd1YWdlCm9wdGlvbnMoc2NpcGVuID0gNSkgIyBUbyBkZWFjdGl2YXRlIGFubm95aW5nIHNjaWVudGlmaWMgbnVtYmVyIG5vdGF0aW9uCgojIHJtKGxpc3Q9bHMoKSk7IGdyYXBoaWNzLm9mZigpICMgZ2V0IHJpZCBvZiBldmVyeXRoaW5nIGluIHRoZSB3b3Jrc3BhY2UKaWYgKCFyZXF1aXJlKCJrbml0ciIpKSBpbnN0YWxsLnBhY2thZ2VzKCJrbml0ciIpOyBsaWJyYXJ5KGtuaXRyKSAjIEZvciBkaXNwbGF5IG9mIHRoZSBtYXJrZG93bgoKIyMjIEtuaXRyIG9wdGlvbnMKa25pdHI6Om9wdHNfY2h1bmskc2V0KHdhcm5pbmc9RkFMU0UsCiAgICAgICAgICAgICAgICAgICAgIG1lc3NhZ2U9RkFMU0UsCiAgICAgICAgICAgICAgICAgICAgIGZpZy5hbGlnbj0iY2VudGVyIgogICAgICAgICAgICAgICAgICAgICApCmBgYAoKIyMgUHJlYW1ibGUKCmBgYHtyfQojIENsZWFyIHdvcmtzcGFjZQpybShsaXN0PWxzKCkpOyBncmFwaGljcy5vZmYoKSAKYGBgCgpgYGB7cn0KIyMjIExvYWQgcGFja2FnZXMKbGlicmFyeSh0aWR5dmVyc2UpICMgQ29sbGVjdGlvbiBvZiBhbGwgdGhlIGdvb2Qgc3R1ZmYgbGlrZSBkcGx5ciwgZ2dwbG90MiBlY3QuCmxpYnJhcnkobWFncml0dHIpICMgRm9yIGV4dHJhLXBpcGluZyBvcGVyYXRvcnMgKGVnLiAlPD4lKQpsaWJyYXJ5KHNraW1yKSAjIEZvciBuaWNlIGRhdGEgc3VtbWFyaWVzCmBgYAoKCiMgTG9hZCB0aGUgZGF0YQoKYGBge3J9Cmxpc3RpbmdzIDwtIHJlYWRfY3N2KCdodHRwOi8vZGF0YS5pbnNpZGVhaXJibmIuY29tL2Rlbm1hcmsvaG92ZWRzdGFkZW4vY29wZW5oYWdlbi8yMDIwLTA2LTI2L2RhdGEvbGlzdGluZ3MuY3N2Lmd6JykKbGlzdGluZ3MgJT4lIGhlYWQoKQpgYGAKCmBgYHtyfQpjYWxlbmRhciA8LSByZWFkX2NzdignaHR0cDovL2RhdGEuaW5zaWRlYWlyYm5iLmNvbS9kZW5tYXJrL2hvdmVkc3RhZGVuL2NvcGVuaGFnZW4vMjAyMC0wNi0yNi9kYXRhL2NhbGVuZGFyLmNzdi5neicpCmNhbGVuZGFyICU+JSBoZWFkKCkKYGBgCgpgYGB7cn0KcmV2aWV3cyA8LSByZWFkX2NzdignaHR0cDovL2RhdGEuaW5zaWRlYWlyYm5iLmNvbS9kZW5tYXJrL2hvdmVkc3RhZGVuL2NvcGVuaGFnZW4vMjAyMC0wNi0yNi9kYXRhL3Jldmlld3MuY3N2Lmd6JykKcmV2aWV3cyAlPiUgaGVhZCgpCmBgYAoKIyBEYXRhIG11bmdpbmcvdGlkeWluZwoKSW5zcGVjdCBhZ2Fpbi4uLgoKYGBge3J9CmRhdGEgPC0gbGlzdGluZ3MgCmBgYAoKYGBge3J9CmRhdGEgJT4lIGdsaW1wc2UoKQpgYGAKCiMjIyBWYXJpYWJsZSB0cmFuc2Zvcm1hdGlvbnMKCmBgYHtyfQpkYXRhICU8PiUKICBtdXRhdGUocHJpY2UgPSBwcmljZSAlPiUgcGFyc2VfbnVtYmVyKCksCiAgICAgICAgIGNsZWFuaW5nX2ZlZSA9IHBhcnNlX251bWJlcihjbGVhbmluZ19mZWUpLAogICAgICAgICBwcmljZV9hbGwgPSBwcmljZSArIGNsZWFuaW5nX2ZlZSkgJT4lCiAgZmlsdGVyKG51bWJlcl9vZl9yZXZpZXdzID49IDIpCmBgYAoKIyMjIFZhcnJpYWJsZSBzZWxlY3Rpb24KCmBgYHtyfQpkYXRhICU8PiUgCiAgcmVuYW1lKHkgPSBwcmljZV9hbGwpICU+JQogIHNlbGVjdCh5LCByZXZpZXdfc2NvcmVzX3JhdGluZywgbmVpZ2hib3VyaG9vZF9jbGVhbnNlZCwgYWNjb21tb2RhdGVzLCByb29tX3R5cGUsIGJhdGhyb29tcyxpc19idXNpbmVzc190cmF2ZWxfcmVhZHksCiAgICAgICAgIG51bWJlcl9vZl9yZXZpZXdzLCBjYW5jZWxsYXRpb25fcG9saWN5LCBob3N0X2lzX3N1cGVyaG9zdCwgaG9zdF9pZGVudGl0eV92ZXJpZmllZCwgYmVkcm9vbXMpIAoKYGBgCgojIyMgRmlsdHJpbmcgb2JzZXJ2YXRpb25zCgpgYGB7cn0KZGF0YSAlPD4lIAogIGRyb3BfbmEoeSkgJT4lCiAgZmlsdGVyKHBlcmNlbnRfcmFuayh5KSA8MC45NSkKYGBgCgpBbHNvIGNoZWNrIHRoZSBjYXRlZ29yaWNhbCB2YXJpYWJsZXMgZm9yIHJhcmUgdHlwZXMKCmBgYHtyfQpkYXRhICU+JSBjb3VudChyb29tX3R5cGUsIHNvcnQgPSBUUlVFKQpgYGAKCmBgYHtyfQpkYXRhICU8PiUKICBmaWx0ZXIoIShyb29tX3R5cGUgJWluJSBjKCdTaGFyZWQgcm9vbScsICdIb3RlbCByb29tJykpKQpgYGAKCgojIyMgTWlzc3NpbmcgZGF0YQoKYGBge3J9CiMgR2V0IHJpZCBvZiBlbXB0eSBzdHJpbmdzCmxpc3RpbmdzICU8PiUKICAgIG11dGF0ZShhY3Jvc3MoaXNfY2hhcmFjdGVyLCB+aWZlbHNlKC54ID09ICIiLCBOQSwgLngpKSkKYGBgCgpgYGB7cn0KbGlicmFyeShWSU0pICMgVG8gaW5zcGN0IG1pc3NpbmcgZGF0YSBwYXR0ZXJuCmBgYAoKYGBge3J9CmRhdGEgJT4lCiAgYWdncihudW1iZXJzID0gVFJVRSwgcHJvcCA9IGMoVFJVRSwgRkFMU0UpKQpgYGAKTG9va3MgZmluZSwgbGl0dGUgbWlzc2luZwoKIyBFREEKCmBgYHtyfQpkYXRhICU+JQogIHNraW0oKQpgYGAKCmBgYHtyLCBmaWcuaGVpZ2h0PTEyLCBmaWcud2lkdGg9MTJ9CmxpYnJhcnkoR0dhbGx5KQpkYXRhICU+JSBnZ3BhaXJzKCkgCmBgYAoKIyBQcmVwcm9jZXNzaW5nCgpgYGB7cn0KbGlicmFyeSh0aWR5bW9kZWxzKQpgYGAKCiMjIyBUcmFpbiBhbmQgdGVzdCBzcGxpdAoKYGBge3J9CmRhdGFfc3BsaXQgPC0gaW5pdGlhbF9zcGxpdChkYXRhLCBwcm9wID0gMC43NSwgc3RyYXRhID0geSkKCmRhdGFfdHJhaW4gPC0gZGF0YV9zcGxpdCAgJT4lICB0cmFpbmluZygpCmRhdGFfdGVzdCA8LSBkYXRhX3NwbGl0ICU+JSB0ZXN0aW5nKCkKYGBgCgpgYGB7cn0KZGF0YV9yZWNpcGUgPC0gZGF0YV90cmFpbiAlPiUKICByZWNpcGUoeSB+LikgJT4lCiAgc3RlcF9jZW50ZXIoYWxsX251bWVyaWMoKSwgLWFsbF9vdXRjb21lcygpKSAlPiUKICBzdGVwX3NjYWxlKGFsbF9udW1lcmljKCksIC1hbGxfb3V0Y29tZXMoKSkgJT4lCiAgc3RlcF9kdW1teShhbGxfbm9taW5hbCgpLCAtYWxsX291dGNvbWVzKCksIG9uZV9ob3QgPSBUUlVFKSAKYGBgCgoKIyBNb2RlbHMgJiBXb3JrZmxvd3MKCiMjIyBEZWZpbmUgTW9kZWxzCgpgYGB7cn0KbW9kZWxfbG0gPC0gbGluZWFyX3JlZyhtb2RlID0gJ3JlZ3Jlc3Npb24nKSAlPiUKICBzZXRfZW5naW5lKCdsbScpIApgYGAKCmBgYHtyfQptb2RlbF94ZyA8LSBib29zdF90cmVlKG1vZGUgPSAncmVncmVzc2lvbicsIAogICAgICAgICAgICAgICAgICAgICAgIHRyZWVzID0gMTAwLAogICAgICAgICAgICAgICAgICAgICAgIG10cnkgPSB0dW5lKCksIAogICAgICAgICAgICAgICAgICAgICAgIG1pbl9uID0gdHVuZSgpLCAKICAgICAgICAgICAgICAgICAgICAgICB0cmVlX2RlcHRoID0gdHVuZSgpLCAKICAgICAgICAgICAgICAgICAgICAgICBsZWFybl9yYXRlID0gdHVuZSgpCiAgICAgICAgICAgICAgICAgICAgICAgKSAlPiUKICBzZXRfZW5naW5lKCJ4Z2Jvb3N0IikgCmBgYAoKIyMjIFdvcmtmbG93cwpgYGB7cn0Kd29ya2Zsb3dfZ2VuZXJhbCA8LSB3b3JrZmxvdygpICU+JQogIGFkZF9yZWNpcGUoZGF0YV9yZWNpcGUpIAoKd29ya2Zsb3dfbG0gPC0gd29ya2Zsb3dfZ2VuZXJhbCAlPiUKICBhZGRfbW9kZWwobW9kZWxfbG0pCgp3b3JrZmxvd194ZyA8LSB3b3JrZmxvd19nZW5lcmFsICU+JQogIGFkZF9tb2RlbChtb2RlbF94ZykKYGBgCgojIEh5cGVycGFyYW1ldGVyIFR1bmluZwoKIyMjIFJlc2FtcGxpbmcKCmBgYHtyfQpkYXRhX3Jlc2FtcGxlIDwtIGRhdGFfdHJhaW4gJT4lIAogIHZmb2xkX2N2KHN0cmF0YSA9IHksCiAgICAgICAgICAgdiA9IDMsCiAgICAgICAgICAgcmVwZWF0cyA9IDIpCmBgYAoKIyMjIEh5cGVycGFyYW1ldGVyIFR1bmluZwoKYGBge3J9CnR1bmVfeGcgPC0KICB0dW5lX2dyaWQoCiAgICB3b3JrZmxvd194ZywKICAgIHJlc2FtcGxlcyA9IGRhdGFfcmVzYW1wbGUsCiAgICBncmlkID0gMTAKICApCmBgYAoKYGBge3J9CnR1bmVfeGcgJT4lIGF1dG9wbG90KCkKYGBgCgpgYGB7cn0KYmVzdF9wYXJhbV94ZyA8LSB0dW5lX3hnICU+JSBzZWxlY3RfYmVzdChtZXRyaWMgPSAncm1zZScpCmJlc3RfcGFyYW1feGcKYGBgCgpgYGB7cn0KdHVuZV94ZyAlPiUgc2hvd19iZXN0KG1ldHJpYyA9ICdybXNlJywgbiA9IDEpCmBgYAoKIyBGaXQgbW9kZWxzCgojIyMgRml0IG1vZGVscyB3aXRoIHR1bmVkIGh5cGVycGFyYW1ldGVycwoKYGBge3J9CndvcmtmbG93X2ZpbmFsX3hnIDwtIHdvcmtmbG93X3hnICU+JQogIGZpbmFsaXplX3dvcmtmbG93KHBhcmFtZXRlcnMgPSBiZXN0X3BhcmFtX3hnKQpgYGAKCmBgYHtyfQpmaXRfbG0gPC0gd29ya2Zsb3dfbG0gJT4lCiAgZml0KGRhdGFfdHJhaW4pCgpmaXRfeGcgPC0gd29ya2Zsb3dfZmluYWxfeGcgJT4lCiAgZml0KGRhdGFfdHJhaW4pCmBgYAoKIyMjIFByZWRpY3QKCmBgYHtyfQpwcmVkX2NvbGxlY3RlZCA8LSB0aWJibGUoCiAgdHJ1dGggPSBkYXRhX3Rlc3QgJT4lIHB1bGwoeSksCiAgYmFzZSA9IG1lYW4odHJ1dGgpLAogIGxtID0gZml0X2xtICU+JSBwcmVkaWN0KG5ld19kYXRhID0gZGF0YV90ZXN0KSAlPiUgcHVsbCgucHJlZCksCiAgeGcgPSBmaXRfeGcgJT4lIHByZWRpY3QobmV3X2RhdGEgPSBkYXRhX3Rlc3QpICU+JSBwdWxsKC5wcmVkKSwKICApICU+JSAKICBwaXZvdF9sb25nZXIoY29scyA9IC10cnV0aCwKICAgICAgICAgICAgICAgbmFtZXNfdG8gPSAnbW9kZWwnLAogICAgICAgICAgICAgICB2YWx1ZXNfdG8gPSAnLnByZWQnKQpgYGAKCiMgRXZhbHVhdGUKCiMjIyBNZXRyaWNzCgpgYGB7cn0KcHJlZF9jb2xsZWN0ZWQgJT4lCiAgZ3JvdXBfYnkobW9kZWwpICU+JQogIHJtc2UodHJ1dGggPSB0cnV0aCwgZXN0aW1hdGUgPSAucHJlZCkgJT4lCiAgc2VsZWN0KG1vZGVsLCAuZXN0aW1hdGUpICU+JQogIGFycmFuZ2UoLmVzdGltYXRlKQpgYGAKCmBgYHtyfQpwcmVkX2NvbGxlY3RlZCAlPiUKICBncm91cF9ieShtb2RlbCkgJT4lCiAgcnNxKHRydXRoID0gdHJ1dGgsIGVzdGltYXRlID0gLnByZWQpICU+JQogIHNlbGVjdChtb2RlbCwgLmVzdGltYXRlKSAlPiUKICBhcnJhbmdlKC5lc3RpbWF0ZSkKYGBgCiMjIyBWaXN1YWxzCgpgYGB7cn0KcHJlZF9jb2xsZWN0ZWQgJT4lCiAgZ2dwbG90KGFlcyh4ID0gdHJ1dGgsIHkgPSAucHJlZCwgY29sb3IgPSBtb2RlbCkpICsKICBnZW9tX2FibGluZShsdHkgPSAyLCBjb2xvciA9ICJncmF5ODAiLCBzaXplID0gMS41KSArCiAgZ2VvbV9wb2ludChhbHBoYSA9IDAuNSkgKwogIGxhYnMoCiAgICB4ID0gIlRydXRoIiwKICAgIHkgPSAiUHJlZGljdGVkIHByaWNlIiwKICAgIGNvbG9yID0gIlR5cGUgb2YgbW9kZWwiCiAgKQpgYGAKIyMjIFZhcmlhYmxlIEltcG9ydGFuY2UKCkxpbmVhciBtb2RlbApgYGB7cn0KZml0X2xtICU+JSAKIyAgcGx1Y2soIi53b3JrZmxvdyIsIDEpICU+JSAgIAogIHB1bGxfd29ya2Zsb3dfZml0KCkgJT4lIAogIHZpcDo6dmlwKG51bV9mZWF0dXJlcyA9IDEwKQpgYGAKWEdCb29zdApgYGB7cn0KZml0X3hnICU+JSAKIyAgcGx1Y2soIi53b3JrZmxvdyIsIDEpICU+JSAgIAogIHB1bGxfd29ya2Zsb3dfZml0KCkgJT4lIAogIHZpcDo6dmlwKG51bV9mZWF0dXJlcyA9IDEwKQpgYGAK